library(tidyverse)
library(magrittr)
library(lubridate)
library(scales)
library(matrixStats)
library(ggrepel)
library(broom)
library(glue)
library(jsonlite)
library(rvest)
library(RCurl)
library(pander)
library(DT)
library(plotly)
library(cowplot)
panderOptions("big.mark", ",")
panderOptions("table.split.table", Inf)
panderOptions("table.style", "rmarkdown")
panderOptions("missing", "")
theme_set(theme_bw())
# Handle updates between 12am & 9am
dt <- Sys.Date()
if (as.numeric(format(Sys.time(), "%H")) <= 9){
  dt <- Sys.Date() - 1
}
auStates <- c(
  ACT = "Australian Capital Territory",
  QLD = "Queensland",
  NSW = "New South Wales",
  VIC = "Victoria",
  SA = "South Australia",
  WA = "Western Australia",
  NT = "Northern Territory",
  TAS = "Tasmania"
)

Disclaimer: This very simple report was prepared by a bioinformatician with no experience in epidemiology or virology, and as such should be treated simply as an alternate viewpoint on the data, which I was simply unable to find elsewhere. Many other people exist with much greater expertise on this subject. However, I do hope this provides a useful perspective which is able to add constructively to the wider discussion. In addition, it should be noted that this is very much focussed on Australian data.

Data Sources

confirmed <- url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv") %>%
    read_csv() %>%
    pivot_longer(
        cols = ends_with("20"),
        names_to = "date",
        values_to = "confirmed"
    )  %>%
    mutate(
        date = str_replace_all(
            date, "(.+)/(.+)/(.+)", "20\\3-\\1-\\2"
        ) %>%
            ymd()
    ) %>%
    dplyr::rename(
        Country = `Country/Region`
    ) %>%
    dplyr::mutate(
        Country = case_when(
            `Province/State` == "Hubei" ~ "China (Hubei)",
            `Province/State` == "Hong Kong" ~ "Hong Kong",
            grepl("China", Country) & !`Province/State` %in% c("Hubei", "Hong Kong") ~ "China (Other)",
            Country == "Korea, South" ~ "South Korea",
            !grepl("China", Country) ~ Country
        )
    ) %>%
    dplyr::filter(
        !is.na(confirmed)
    ) %>%
  dplyr::select(-Lat, -Long)
latestAU <- list()
nswRecov <- nswTests <- NA_real_
nswUrl <- "https://www.health.nsw.gov.au/Infectious/covid-19/Pages/stats-nsw.aspx"
nswTables <- nswUrl %>%
  read_html() %>%
  xml_find_all("//table") %>%
  html_table() %>%
  .[[1]]
nswTests <- nswTables %>%
  dplyr::filter(
    str_detect(Cases, "Total people tested"),
    !str_detect(Cases, "excluded")
  ) %>%
  .[["Count"]] %>%
  str_remove_all(",") %>%
  as.numeric()
nswRecov <- nswTables %>%
  dplyr::filter(str_detect(Cases, "Recovered")) %>%
  .[["Count"]] %>%
  str_remove_all(",") %>%
  str_extract("[0-9]+") %>%
  as.numeric()
latestAU$NSW <- tibble(
  State = "New South Wales",
  date = dt,
  confirmed = nswTables %>% 
    dplyr::filter(str_detect(Cases, "Total confirmed")) %>%
    .[["Count"]] %>% 
    str_remove_all(",") %>%
    str_extract("[0-9]+") %>% 
    as.numeric(),
  deaths = nswTables %>% 
    dplyr::filter(str_detect(Cases, "Total deaths")) %>%
    .[["Count"]] %>% 
    str_remove_all(",") %>%
    str_extract("[0-9]+") %>% 
    as.numeric(),
  recovered = nswRecov,
  tested = nswTests
)
qldRecov <- qldTests <- NA_real_
qldUrl <- "https://www.qld.gov.au/health/conditions/health-alerts/coronavirus-covid-19/current-status/statistics"
qldTests <- qldUrl %>%
  read_html() %>%
  html_nodes("body") %>%
  xml_find_all("//div[contains(@class, 'tested')]") %>%
  html_text() %>%
  str_extract("[0-9,]+") %>%
  str_remove_all(",") %>%
  as.numeric()
qldTable <- qldUrl %>%
  read_html() %>%
  html_nodes("body") %>%
  xml_find_all("//table[@id = 'QLD_Cases']") %>%
  html_table() %>%
  .[[1]] 
qldRecov <- qldTable %>%
  dplyr::filter(Cases == "Recovered") %>%
  .[["Total"]] %>%
  as.numeric()
latestAU$QLD <- tibble(
  State = "Queensland",
  date = dt,
  confirmed = qldTable %>% 
    dplyr::filter(str_detect(Cases, "Number of confirmed")) %>%
    .[["Total"]] %>% 
    str_remove_all(",") %>%
    str_extract("[0-9]+") %>% 
    as.numeric(),
  deaths = qldTable %>% 
    dplyr::filter(str_detect(Cases, "Deaths")) %>%
    .[["Total"]] %>% 
    str_remove_all(",") %>%
    str_extract("[0-9]+") %>% 
    as.numeric(),
  recovered = qldRecov,
  tested = qldTests
)
vicRecov <- vicTests <- NA_real_
vicUrl <- "https://www.dhhs.vic.gov.au/coronavirus-covid-19-daily-update"
vicTxt <- vicUrl %>%
  read_html() %>%
  html_nodes("body") %>% 
  html_text() %>% 
  str_split("\n") %>% 
  .[[1]] %>% 
  str_trim() %>% 
  .[. != ""] %>% 
    str_subset("(tested|recovered|total|died)")
vicTests <- vicTxt %>%
  str_subset("test(ed|s)") %>%
  .[[1]] %>%
  str_replace_all(
    ".+ ([0-9,]+) .*test(ed|s).+",
    "\\1"
  ) %>%
  str_remove_all(",") %>%
  as.numeric()
vicRecov <- vicTxt %>%
  str_subset("recovered") %>%
  str_trim() %>%
  str_replace_all(".+ ([0-9,]+)[^0-9]people have recovered..+", "\\1") %>%
  str_remove(",") %>%
  as.numeric()
latestAU$VIC <- tibble(
  State = "Victoria",
  date = dt,
  confirmed = vicTxt %>%
    str_subset("coronavirus.+cases") %>% 
    str_replace_all(".+is ([0-9,]+).+", "\\1") %>% 
    str_remove_all(",") %>%
    as.numeric(),
  deaths = vicTxt %>% 
    str_subset("died") %>% 
    str_replace_all(".+ ([0-9,]+) people have died.", "\\1") %>%
    str_remove_all(",") %>% 
    as.numeric(),
  recovered = vicRecov,
  tested = vicTests
)
waTests <- NA_real_
waRecov <- NA_real_
waUrl <- "https://ww2.health.wa.gov.au/Articles/A_E/Coronavirus/COVID19-statistics"
waTable <- waUrl %>%
  read_html() %>%
  xml_find_all("//table") %>%
  .[[1]] %>%
  html_table() %>%
  mutate(
    Total = str_remove_all(Total, ","),
    Total = as.numeric(Total)
  ) 
waTests <- waTable %>%
  dplyr::filter(str_detect(Cases, "ive")) %>%
  summarise(n = sum(Total)) %>%
  .[["n"]]
waRecov <- dplyr::filter(waTable, Cases == "Recovered")$Total
latestAU$WA <- tibble(
  State = "Western Australia",
  date = dt,
  confirmed = waTable %>%
    dplyr::filter(str_detect(Cases, "positive")) %>%
    .[["Total"]],
  deaths = waTable %>%
    dplyr::filter(str_detect(Cases, "Deaths")) %>%
    .[["Total"]],
  recovered = waRecov,
  tested = waTests
)
saRecov <- saTests <- NA_real_
saUrl <- "https://www.covid-19.sa.gov.au/home/dashboard"
saTxt <- saUrl  %>%
  read_html() %>%
  xml_find_all("//div[contains(@class, 'accordion__content')]") %>%
  .[[1]] %>%
  html_text() %>%
  str_split("\r\n") %>%
  .[[1]] %>%
  str_trim() %>%
  .[. != ""]
saTests <- saTxt %>%
  str_replace_all(".+conducted.* ([0-9, ]+) COVID-19 laboratory tests.+", "\\1") %>%
  str_remove_all("[, ]") %>%
  as.numeric()
saTable <- "https://www.covid-19.sa.gov.au/" %>% 
  read_html() %>% 
  html_node("body") %>% 
  xml_find_all("//div[@class = 'focus-boxes']") %>%
  html_text() %>% 
  str_split("\r\n") %>% 
  .[[1]] %>% 
  str_trim() %>% 
  .[.!=""] %>% 
  .[seq(length(.), 1, by = -1)] %>% 
  .[!str_detect(., "(Data current|View the|See more)")] %>%
  matrix(ncol = 2, byrow = TRUE) %>% 
  set_colnames(c("Cases", "Total")) %>% 
  as.data.frame(stringsAsFactors = FALSE) %>% 
  mutate(
    Total = as.numeric(Total),
    Cases = str_remove_all(Cases, "in South Australia")
  )
saRecov <- saTable %>%
  dplyr::filter(str_detect(Cases, "recovered")) %>%
  .[["Total"]]
latestAU$SA <- tibble(
  State = "South Australia",
  date = dt,
  confirmed = saTable %>% 
    dplyr::filter(str_detect(Cases, "Total cases")) %>% 
    .[["Total"]],
  deaths = 4, # Only able to be added manually. Thanks SA Health
  recovered = saRecov,
  tested = saTests
)
actRecov <- actTests <- NA_real_
actUrl <- "https://www.health.act.gov.au/about-our-health-system/novel-coronavirus-covid-19"
actTable <- actUrl %>%
  read_html() %>%
  html_nodes("body") %>%
  xml_find_all(
    "//div[contains(@class, 'spf-article-card--tabular')]"
  ) %>% 
  html_text() %>% 
  str_split("\r\n") %>% 
  .[[4]] %>% 
  str_trim() %>% 
  setdiff(y = "") %>% 
  str_remove_all("[,*]") %>%
  matrix(ncol = 2, byrow = TRUE) %>%
  set_colnames(c("Cases", "Total")) %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  mutate(Total = as.numeric(Total))
actTests <- actTable %>% 
  dplyr::filter(!str_detect(Cases, "recovered")) %>%
  .[["Total"]] %>%
  sum()
actRecov <- actTable %>%
  dplyr::filter(str_detect(Cases, "recov")) %>%
  .[["Total"]]
latestAU$ACT <- tibble(
  State = "Australian Capital Territory",
  date = dt,
  confirmed = actTable %>% 
    dplyr::filter(str_detect(Cases, "Confirmed cases")) %>% 
    .[["Total"]],
  deaths = 3, # Only able to be added manually. Thanks SA Health
  recovered = actRecov,
  tested = actTests
)
tasUrl <- "https://www.coronavirus.tas.gov.au/facts/cases-and-testing-updates"
tasTables <- tasUrl %>%
  read_html() %>%
  html_nodes("body") %>%
  xml_find_all("//table") %>%
  html_table() %>%
  lapply(mutate, Number = str_remove_all(Number, "[^0-9]")) %>%
  lapply(mutate, Number  = as.numeric(Number))
latestAU$TAS <- tibble(
  State = "Tasmania",
  date = dt,
  confirmed = dplyr::filter(
     tasTables[[1]], 
     str_detect(`Cases in Tasmania`, "Total cases")
     )$Number,
  deaths = dplyr::filter(
     tasTables[[1]], 
     str_detect(`Cases in Tasmania`, "Deaths")
     )$Number,
  recovered = dplyr::filter(
    tasTables[[1]],
    `Cases in Tasmania` == "Recovered"
    )$Number,
  tested = dplyr::filter(
    tasTables[[2]],
    `Laboratory tests` == "Total laboratory tests"
    )$Number
)
ntRecov <- ntTests <- NA_real_
ntUrl <- "https://coronavirus.nt.gov.au/"
ntTxt <- ntUrl %>%
  read_html() %>%
  html_nodes("body") %>%
  xml_find_all(
    "//div[contains(@class, 'header-widget')]"
  ) %>%
  html_text() %>%
    str_split("\r\n") %>%
    .[[1]] %>%
    str_trim() %>%
    .[. != ""]
ntTests <- ntTxt %>%
  str_subset("test") %>%
  str_extract("([0-9,]+)") %>%
  str_remove_all(",") %>%
  as.numeric()
ntRecov <- ntTxt %>%
  str_subset("recovered") %>%
  str_extract("[0-9,]+") %>%
  str_remove_all(",") %>%
  as.numeric()
latestAU$NT <- tibble(
  State = "Northern Territory",
  date = dt,
  confirmed = ntTxt %>%
    str_subset("confirmed") %>% 
    str_extract("[0-9]+") %>% 
    as.numeric(),
  deaths = 0,
  recovered = ntRecov,
  tested = ntTests
)
latestAU %<>% 
  bind_rows() %>%
  mutate(Country = "Australia")
latestAU %>%
  dplyr::select(
    `Province/State` = State, Country, date, recovered
  ) %>%
  write_tsv(
    here::here(glue("recovered/recovered_{dt}.tsv"))
  )
confirmed %<>%
  bind_rows(
    dplyr::select(latestAU, any_of(colnames(.)), `Province/State` = State)
  ) %>%
  arrange(date) %>%
  mutate(f = paste(`Province/State`, Country, sep = "_")) %>%
  split(f = .$f) %>%
  lapply(mutate, confirmed = cummax(confirmed)) %>%
  bind_rows() %>%
  dplyr::select(-f) %>%
  dplyr::distinct(`Province/State`, Country, date, .keep_all = TRUE)
auRecTS <- list.files(here::here("recovered"), pattern = "rec.*tsv",full.names = TRUE) %>%
  lapply(read_tsv) %>%
  bind_rows()
recovered <- url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv") %>%
  read_csv() %>%
  pivot_longer(
    cols = ends_with("20"),
    names_to = "date",
    values_to = "recovered"
  )  %>%
  mutate(
    date = str_replace_all(
      date, "(.+)/(.+)/(.+)", "20\\3-\\1-\\2"
    ) %>%
      ymd()
  ) %>%
  dplyr::rename(
    Country = `Country/Region`
  ) %>%
  dplyr::mutate(
    Country = case_when(
      `Province/State` == "Hubei" ~ "China (Hubei)",
      `Province/State` == "Hong Kong" ~ "Hong Kong",
      grepl("China", Country) & !`Province/State` %in% c("Hubei", "Hong Kong") ~ "China (Other)",
      grepl("Korea, South", Country) ~ "South Korea",
      !grepl("China", Country) ~ Country
    )
  ) %>%
  dplyr::select(-Lat, -Long) %>%
  bind_rows(auRecTS) %>%
  group_by(
    `Province/State`, Country, date
  ) %>%
  summarise_at(vars(recovered), max)
deaths <- url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv") %>%
  read_csv() %>%
  pivot_longer(
    cols = ends_with("20"),
    names_to = "date",
    values_to = "deaths"
  )  %>%
  mutate(
    date = str_replace_all(
      date, "(.+)/(.+)/(.+)", "20\\3-\\1-\\2"
    ) %>%
      ymd()
  ) %>%
  dplyr::rename(
    Country = `Country/Region`
  ) %>%
  dplyr::mutate(
    Country = case_when(
      `Province/State` == "Hubei" ~ "China (Hubei)",
      `Province/State` == "Hong Kong" ~ "Hong Kong",
      grepl("China", Country) & !`Province/State` %in% c("Hubei", "Hong Kong") ~ "China (Other)",
      Country == "Korea, South" ~ "South Korea",
      !grepl("China", Country) ~ Country
    )
  ) %>%
  dplyr::select(-Lat, -Long) %>%
  bind_rows(
    dplyr::select(latestAU, any_of(colnames(.)), `Province/State` = State)
  ) %>%
  arrange(date) %>%
  mutate(f = paste(`Province/State`, Country, sep = "_")) %>%
  split(f = .$f) %>%
  lapply(mutate, deaths = cummax(deaths)) %>%
  bind_rows() %>%
  dplyr::select(-f) %>%
  dplyr::distinct(`Province/State`, Country, date, .keep_all = TRUE)
wikiPops <- read_tsv("wikiPops.tsv")
alwaysShow <- c("Australia", "New Zealand", "US", "United Kingdom", "Taiwan*", "Singapore", "South Korea", "China (Other)", "China (Hubei)", "Hong Kong", "Japan", "Ireland")

Data for confirmed cases, recoveries and fatalities was primarily sourced from Johns Hopkins University (https://coronavirus.jhu.edu/), using the datasets provided at https://github.com/CSSEGISandData/COVID-19. JHU data is updated every 24 hours at approximately 23:59(UTC), which is about 9:30AM in Adelaide.

Live hourly updates for Australia are available from https://covid-19-au.github.io/ for those who would like an up to the minute breakdown of confirmed cases. Numbers used for generation of this page are updated periodically throughout the day using values provided by individual states at NSW, QLD, VIC, WA, SA, TAS, ACT and the NT.

Population sizes were obtained from 2019 UN Estimates. Given the disparity of infection within China, China was broken into Hubei Province and the rest of China, with Hong Kong and Taiwan already being considered as separate countries in all datasets. Population estimates for Hubei Province were taken from the 2018 estimates given by Statista.com and this is likely to be a very slight underestimate.

Confirmed cases of COVID-19 as provided by the Chinese Government have been discussed elsewhere as unusual, and data appears potentially unreliable. In this analysis, discussions regarding accurate Chinese reporting are not considered further and data is simply presented as supplied by JHU.

However, all countries are likely to contain many unreported cases given the incomplete testing regimes in place for most countries. Similarly, reporting in many countries may have features that cause concerns regarding data integrity and this makes comparison across countries difficult. Information on recovered cases has been difficult to accurately obtain due inconsistent methods for considering a case as recovered, and lack of reporting for these cases in many jurisdictions. In Australia, some states (e.g. NSW & QLD) have only begun releasing these figures in mid-April, whilst other states such as Victoria were releasing these numbers from mid-March.

International Data

startingPoint <- 4
minPop <- 5e6

For this section, most data is presented relative to population size. Growth in infection rates is only shown after the point at which the cumulative confirmed infection rate breached 4 confirmed cases / million. This equates to about 101 confirmed cases within Australia, and is broadly comparable to the “Days since passing 100 confirmed cases” commonly shown elsewhere.

Table of Most Impacted Countries

Confirmed cases in this table are effectively the cumulative, confirmed incidence rate. Recovered patients and those who have passed away are still included in these numbers. At the time of report preparation, the total number of confirmed cases, including all countries for which data has been made available, is 2,810,837.

confirmed %>%
  group_by(Country, date) %>%
  summarise(confirmed = sum(confirmed)) %>%
  ungroup() %>%
  group_by(Country) %>%
  dplyr::filter(
    date == max(date),
  ) %>%
  ungroup() %>%
  inner_join(wikiPops) %>%
  mutate(
    rate = 1e6*confirmed / Population,
    occurrence = Population / confirmed,
    Population = round(Population, -3) / 1e6
  ) %>%
  dplyr::filter(rate >= 1) %>%
  arrange(desc(rate)) %>%
  rename_at(vars(everything()), str_to_title) %>%
  dplyr::select(
    Continent, Region, Country,
    Date, Confirmed, 
    `Population (millions)` = Population, 
    `Rate (Cases per Million)` = Rate,
    Occurrence
  ) %>%
  datatable(
    options = list(
      pageLength = 25, 
      autoWidth = TRUE,
      searchCols = list(
        NULL, NULL, NULL, NULL, NULL,
        list(
          search = glue(
            '{minPop/1e6 + 0.001} ... {max(wikiPops$Population/1e6)}'
          )
        ),
        NULL
      )
    ),
    filter = 'top',
    class = "stripe",
    rownames = FALSE,
    caption = paste(
      "The most impacted countries studied here and shown as a proportion of total population.",
      "The initial filter is set so that only countries with a population greater than", comma(minPop), "are shown.",
      "All fields are searchable and sortable.",
      "To filter numeric columns, either use the slider or enter the values in the form 'min ... max'.",
      "To filter text columns, partial matching used in a case-insensitive manner.",
      "Populations have been rounded to the nearest thousand to make reading values easier.",
      "'Rate' represents the latest confirmed infection rate as cumulative cases per million people, whilst 'Occurrence' represents the number of people expected before one case is found, assuming an even distribution amongst the population.",
      "In other words, one in every 'Occurrence' people within the population have been confirmed to have contracted COVID-19.",
      "Occurrence is inversely proportional to Rate.",
      "No adjustment has been made in this table for patients who have recovered or passed away.",
      "Whilst the virus spreads with no regard to population size, the rate as shown here indicates the degree of stress which each country's health-care system is likely to be experiencing.",
      "Several countries shown here have not attracted much media attention due lower case numbers than China and Italy, but are likely to be experiencing significant duress.",
      "Continent and Region information is as provided by the UN classifications."
    )
  ) %>%
  formatCurrency(
    columns = c("Population (millions)"),
    currency = "", 
    digits = 3,
    mark = ","
  ) %>%
  formatCurrency(
    columns = c("Confirmed", "Rate (Cases per Million)", "Occurrence"),
    currency = "", 
    digits = 0,
    mark = ","
  )

Daily Increase Vs Confirmed Cases

An alternate viewpoint on the data is to remove time and inspect the relationship between the daily increase in cases and the total number of cases. When this relationship ceases it’s near linear relationship, this can be a sign the control measures have begun to take effect. Whilst this relationship appears to have broken down for Australia, no breakdown has yet occurred for countries such as the Singapore, Canada, the UK, USA and Turkey, with these countries explicitly still in the exponential growth phase.

sma <- function(x, n = 7){
    
    if (length(x) < n) return(rep(NA, length(x)))
    ind <- vapply(seq(1, length(x) - (n - 1)), seq, numeric(n), length.out = n)
    c(
        rep(NA, n - 1),
        colMeans(matrix(x[ind], nrow = n), na.rm = TRUE)
    )
}
minPop <- 10e6
minDays <- 15
minRate <- 9
plotIncVConf <- ggplotly(
confirmed %>%
  group_by(Country, date) %>%
  summarise_at(vars(confirmed), sum) %>%
  dplyr::filter(confirmed > 0) %>%
  inner_join(
    dplyr::filter(wikiPops, Population > minPop | Country %in% alwaysShow)
  ) %>%
  mutate(
    rate = 1e6 * confirmed / Population
  ) %>%
  split(f = .$Country) %>% 
  lapply(function(x, n = 7){
    x %>%
      mutate(
        d = c(0, diff(rate))
      ) %>%
      mutate_at(
        vars(d), sma, n = n
      ) %>%
      dplyr::filter(
        !is.na(d),
        rate > minRate
      ) %>%
      mutate(
        nDays = nrow(.)
      )
  }
  ) %>%
  bind_rows() %>%
  ungroup() %>%
  dplyr::filter(
    nDays > minDays,
    d > 0.01
  ) %>%
  arrange(date) %>%
  mutate(
    Country = fct_inorder(Country),
    rate = round(rate, 2),
    d = round(d, 3),
    Population = comma(round(Population, -3))
  ) %>%
  rename(
    Rate = rate,
    `Average Daily Increase` = d,
    Date = date
  ) %>%
  ggplot(
    aes(Rate, `Average Daily Increase`, colour = Country, label = Population, key = Date)
  ) +
  geom_hline(
    aes(yintercept = `Average Daily Increase`),
    data = . %>%
      dplyr::filter(Country == "Australia", Date == max(Date)),
    linetype = 3,
    colour = "blue",
    size = 1/3
  ) +
  geom_vline(
    aes(xintercept = Rate),
    data = . %>%
      dplyr::filter(Country == "Australia", Date == max(Date)),
    linetype = 3,
    colour = "blue",
    size = 1/3
  ) +
    geom_point() +
  geom_line() +
  scale_y_log10(
    name = "Average Daily Increase (Cases / Million)",
    label = label_comma(accuracy = 0.01)) +
  scale_x_log10(
    name = "Cumulative Confirmed Infection Rate (Cases / Million)"
  )
)
capIncVConf <- glue(
  "*Daily Increase in Cases plotted against Confirmed Cases, using confirmed cases / million.
  These two values are directly proportional until interventions are successful, at which point the proportional relationship changes, as evidenced by a sudden downwards turn.
  Scaling by population aids in the visualisation of where in the relative infection trajectory each country's control measures have begun to take effect.
  Daily increases are shown using a 7-day moving average in order to minimise the impact of day-to-day variation.
  Countries are only shown from the point at which the moving average exceeds {minRate} cases/million, and have exceeded this value for > {minDays} days.
  Data is additionally restricted to countries with a population > {comma(minPop)}, for greater clarity, with the addition of Singapore.
  Australia's position is indicated by the intersection of the dashed blue lines.*
  "
)
ggplotly(plotIncVConf)

Daily Increase in Cases plotted against Confirmed Cases, using confirmed cases / million. These two values are directly proportional until interventions are successful, at which point the proportional relationship changes, as evidenced by a sudden downwards turn. Scaling by population aids in the visualisation of where in the relative infection trajectory each country’s control measures have begun to take effect. Daily increases are shown using a 7-day moving average in order to minimise the impact of day-to-day variation. Countries are only shown from the point at which the moving average exceeds 9 cases/million, and have exceeded this value for > 15 days. Data is additionally restricted to countries with a population > 10,000,000, for greater clarity, with the addition of Singapore. Australia’s position is indicated by the intersection of the dashed blue lines.

Cumulative Incidence Rates

ausDays <- confirmed %>% 
  dplyr::filter(Country == "Australia") %>%
  group_by(Country, date) %>%
  summarise_at(vars(confirmed), sum) %>%
  left_join(wikiPops) %>%
  mutate(rate = 1e6 * confirmed / Population) %>%
  dplyr::filter(rate > startingPoint) %>%
  nrow()
minDays <- ausDays - 5
# Use Singapore as that has the longest dataset besides Hubei
nDays <- confirmed %>%
  dplyr::filter(Country == "Singapore") %>% 
  group_by(Country, date) %>%
  summarise(confirmed = sum(confirmed)) %>%
  ungroup() %>%
  left_join(wikiPops) %>%
  mutate(
    rate = 1e6*confirmed / Population
  ) %>% 
  dplyr::filter(rate > startingPoint) %>% 
  nrow() %>%
  subtract(1)
refRate <- c(2, 4, 8)
minPop <- 10e6
p <- confirmed %>%
  group_by(Country, date) %>%
  summarise(confirmed = sum(confirmed)) %>%
  ungroup() %>%
  inner_join(
    dplyr::filter(wikiPops, Population > minPop | Country %in% alwaysShow)
  ) %>%
  mutate(
    rate = 1e6*confirmed / Population
  ) %>%
  dplyr::filter(
    rate > startingPoint
  ) %>%
  group_by(Country) %>%
  mutate(
    days = date - min(date)
  ) %>%
  dplyr::filter(
    max(days) >= minDays | Country %in% alwaysShow
  ) %>%
  ungroup() %>%
  mutate(
    days = as.integer(days),
    rate = round(rate, 2)
  ) %>%
  dplyr::filter(days <= nDays | Country %in% alwaysShow) %>%
  arrange(date) %>%
  mutate(Country = fct_inorder(Country)) %>%
  rename_all(str_to_title) %>%
  ggplot(
    aes(Days, Rate, colour = Country, Date = Date, Confirmed = Confirmed)
  ) +
  geom_segment(
    aes(x, y, xend = xmax, yend = ymax),
    data = tibble(
      x = 0,
      y = startingPoint,
      xmax = c(16, 32, nDays ),
      ymax = startingPoint*2^(xmax / refRate)
    ),
    inherit.aes = FALSE,
    colour = "grey70",
    linetype = 2
  ) +
  geom_text(
    aes(xmax, ymax, label = label),
    data = tibble(
      xmax = c(16, 32, nDays - 2),
      ymax = startingPoint*2^(xmax / refRate),
      label = glue("Doubling in\n {refRate} days")
    ),
    colour = "grey70",
    inherit.aes = FALSE
  ) +
  geom_vline(
    aes(xintercept = Days),
    data = . %>%
      dplyr::filter(Country == "Australia") %>%
      dplyr::filter(Days == max(Days)),
    linetype = 3,
    colour = "blue",
    size = 1/3
  ) +
  geom_hline(
    aes(yintercept = Rate),
    data = . %>%
      dplyr::filter(Country == "Australia") %>%
      dplyr::filter(Days == max(Days)),
    linetype = 3,
    colour = "blue",
    size = 1/3
  ) +
  geom_point() +
  geom_line() +
  scale_x_continuous(
    expand = expansion(mult = c(0, 0.05)),
  ) +
  scale_y_log10(
    expand = expansion(mult = c(0, 0.05))
  ) +
  xlab(
    paste(
      "Days since passing", 
      startingPoint,
      "confirmed cases/million"
    )
  ) +
  ylab("Confirmed Cumulative Infection Rate (cases/million)")
ggplotly(
  p, 
  tooltip = c(
    "Days", "Rate", "Country", "Date", "Confirmed"
  )) 

COVID-19 Confirmed Cumulative Infection Rate for countries which have exceeded 4 confirmed cases/million for 42 or more days, and with populations greater than 10,000,000, apart from a small number of specifically included countries. Data is only shown for the first 80 calendar days since passing 4 confirmed cases/million. Note that from the day records begin in this dataset (2020-01-22), the confirmed infection rate in Hubei was 7.5 confirmed cases/million. The blue dashed line indicates Australia’s current position on this figure. Diagonal grey lines indicate a doubling in the infection rate every 2, 4, or 8 days. To hide a country, click on the country in the plot legend. Clicking again on the country in the legend will restore the data within the plot. Countries are shown in order of the date at which they passed the 4 confirmed case/million mark. Due to the large number of countries shown, you may need to scroll through the legend. Regions of the plot are also able to be zoomed interactively. Please note the y-axis is shown on the logarithmic scale, so that a series of points which appear to be diagonal will indicate exponential growth. The flatter the line, the slower the growth and a perfectly horizontal line would indicate zero growth, or no new confirmed cases.

All figures and tables presented here simply aim to show an alternative viewpoint on the data. Every way to view COVID-19 data will mask important features, and the values shown here do not take into account vital factors such as population density, variability of infection across regions within countries, social culture and demographics. Many countries may not be directly comparable for a combination of the above factors. It is simply to view the data through the lens of a country’s population size using a value which should be easily interpretable.

In the above plot:

  • Only countries are shown where the infection rate has surpassed 4 cases/million for more than 42 days. This choice seemed to be a useful way of enabling the comparison of COVID-19 progression across countries after scaling for population size

Currently Active Infection Rates

As an alternative viewpoint, the numbers of recovered and deceased patients have been removed from the numbers of confirmed infections plot to provide an estimate of the currently active infections. Importantly, information regarding recovered cases is likely to be the least reliable of reported values as many regions do not report updated numbers for several consecutive days. Additionally many regions do not report recovered cases as the criteria for considering a person to have recovered as currently unclear.

p2 <- confirmed %>%
  dplyr::filter(
    confirmed > 0,
    Country != "China (Other)"
  ) %>%
  left_join(deaths) %>%
  group_by(Country, date) %>%
  summarise_at(
    vars(confirmed, deaths), sum
  ) %>%
  left_join(
    recovered %>%
      group_by(Country, date) %>%
      summarise_at(vars(recovered), sum)
  ) %>%
  mutate_at(vars(confirmed, deaths, recovered), cummax) %>%
  mutate(
    active = confirmed - deaths - recovered
  ) %>%
  dplyr::filter(!is.na(active)) %>%
  inner_join(
    dplyr::filter(wikiPops, Population > minPop | Country %in% alwaysShow)
  ) %>%
  mutate(rate = 1e6 * active / Population) %>%
  dplyr::filter(rate > startingPoint) %>%
  group_by(Country) %>%
  mutate(
    days = date - min(date)
  ) %>%
  dplyr::filter(max(days) > minDays | Country %in% alwaysShow) %>%
  ungroup() %>%
  mutate(
    days = as.integer(days),
    rate = round(rate, 2)
  ) %>%
  dplyr::filter(days <= nDays) %>%
  arrange(date) %>%
  mutate(Country = fct_inorder(Country)) %>%
  rename_all(str_to_title) %>%
  ggplot(
    aes(
      x = Days, y = Rate, colour = Country, 
      Date = Date, Active = Active,
      Confirmed = Confirmed, Recovered = Recovered,
      Deaths = Deaths
      )
  ) +
  geom_segment(
    aes(x, y, xend = xmax, yend = ymax),
    data = tibble(
      x = 0,
      y = startingPoint,
      xmax = c(21, 42, nDays),
      ymax = startingPoint + 2^(xmax/ c(2, 4, 8))
    ),
    inherit.aes = FALSE,
    colour = "grey70",
    linetype = 2
  ) + 
  geom_text(
        aes(xmax, ymax, label = label),
    data = tibble(
      xmax = c(21, 42, nDays),
      rate = c(2, 4, 8),
      ymax = 2^(xmax/ rate),
      label = glue("Double in\n {rate} days")
    ),
    colour = "grey70",
    inherit.aes = FALSE
  ) +
  geom_vline(
    aes(xintercept = Days),
    data = . %>%
      dplyr::filter(Country == "Australia") %>%
      dplyr::filter(Days == max(Days)),
    linetype = 3,
    colour = "blue",
    size = 1/3
  ) +
  geom_hline(
    aes(yintercept = Rate),
    data = . %>%
      dplyr::filter(Country == "Australia") %>%
      dplyr::filter(Days == max(Days)),
    linetype = 3,
    colour = "blue",
    size = 1/3
  ) +
  geom_point() +
  geom_line() +
  scale_x_continuous(
    expand = expansion(mult = c(0, 0.05)),
  ) +
  scale_y_log10(
    expand = expansion(mult = c(0, 0.05))
  ) +
  xlab(
    paste(
      "Days since passing", 
      startingPoint, 
      "confirmed active cases/million"
    )
  ) +
  ylab("Confirmed Active Infection Rate (cases/million)")
ggplotly(p2) 

Confirmed active cases of COVID-19 for countries where the confirmed infection rate has exceeded 4 confirmed active cases/million for more than 80 calendar days. Only countries with a population greater than 10,000,000 are shown for better visualisation. Due to difficulties introduced by the currently reported low active infection rate outside Hubei province, data from China has been excluded from this plot, with the exception of Hubei and Hong Kong. Recovered cases as poorly and irregularly reported by many countries and as such, this plot may be suffer multiple instances of a sudden decline. The blue dashed lines indicates Australia’s current position on this figure. To hide a country, click on the country in the plot legend. Clicking again on the country in the legend will restore the data within the plot. Countries are shown in order of the date at which they passed the 4 confirmed active case/million mark. Due to the number of countries shown, you may need to scroll through the legend. Regions of the plot are also able to be zoomed interactively. Please note the y-axis is shown on the logarithmic scale, so that a series of points which appear to be diagonal will indicate exponential growth/decay. Given the different starting point to the previous plot, data will generally be shown for fewer time-points.

Notable features of this perspective are:

  • Active infection rates in many countries are currently declining
  • Active infection rates appear to be declining in Australia, Hong Kong, Iran and Switzerland
  • Despite strict control measures early on in the pandemic, active infection rates are once again exponentially increasing in Singapore

Comparison of Total Fatalities

minPop <- 4e6
minDeaths <- 500
nCnt <- 25
a <- deaths %>%
  dplyr::filter(deaths > 0) %>%
  group_by(Country, date) %>%
  summarise_at(vars(deaths), sum) %>%
  left_join(wikiPops) %>%
  mutate(rate = 1e6*deaths / Population) %>%
  dplyr::filter(
    Population > minPop,
    rate == max(rate),
    deaths > minDeaths
  ) %>%
  arrange(rate, desc(date)) %>%
  ungroup() %>%
  mutate(Country = fct_inorder(Country)) %>%
  arrange(desc(rate)) %>%
  distinct(Country, .keep_all = TRUE) %>%
  droplevels() %>%
  dplyr::slice(1:nCnt) %>%
  ggplot(aes(Country, rate, fill = Country)) +
  geom_col(colour = "black") +
  geom_label(
    aes(y = rate + 30, label = round(rate, 0)),
    fill = "white", alpha = 0.5
  ) +
  scale_fill_viridis_d(option = "magma", direction = 1) +
  scale_y_continuous(expand = expansion(c(0, 0.08))) +
  labs(y = "Fatalities / Million") +
  coord_flip() +
  facet_grid(Continent~., scales = "free_y", space = "free_y") +
  theme(legend.position = "none")
b <- deaths %>%
  dplyr::filter(deaths > 0) %>%
  group_by(Country, date) %>%
  summarise_at(vars(deaths), sum) %>%
  left_join(wikiPops) %>%
  mutate(rate = 1e6*deaths / Population) %>%
  dplyr::filter(
    Population > minPop,
    rate == max(rate),
    deaths > minDeaths
  ) %>%
  arrange(rate, desc(date)) %>%
  ungroup() %>%
  mutate(Country = fct_inorder(Country)) %>%
  arrange(desc(rate)) %>%
  distinct(Country, .keep_all = TRUE) %>%
  droplevels() %>%
  dplyr::slice(1:nCnt) %>%
  ggplot(aes(Country, deaths, fill = Country)) +
  geom_col(colour = "black") +
  geom_label(
    aes(y = deaths + 4500, label = comma(deaths, accuracy = 1)),
    fill = "white", alpha = 0.5
  ) +
  scale_fill_viridis_d(option = "magma", direction = 1) +
  scale_y_continuous(expand = expansion(c(0, 0.08)), labels = comma) +
  labs(y = "Total Fatalities") +
  coord_flip() +
  facet_grid(Continent~., scales= "free_y", space = "free_y") +
  theme(legend.position = "none")
cp <- glue(
  "*Comparison of the {nCnt} most impacted countries by fatalities per million showing A) the number of fatalities scaled by population size and B) Total Fatalities.
  For fatality rates presented scaled by population size, it is important to realise that a value of 500 indicates that one in every 2000 people from the total population has died (ignoring demographics).
  Similarly, a value of 100 indicates that 1 in every 10,000 from the total population has died in that country.
  Whilst the US currently has the most fatalities, 1 in every {comma(round(1e6 / dplyr::filter(a$data, Country == 'US')$rate, 0))} from the US population have currently been confirmed to have died from COVID19.
  For {as.character(a$data$Country[which.max(a$data$rate)])}, 1 in every {comma(round(1e6 / max(a$data$rate), 0))} people have died directly from COVID 19.
  Only countries with populations > {comma(minPop)}, and with more than {minDeaths} fatalities are shown.
  Countries are also grouped by their continent as designated by the UN classifications*"
)
plot_grid(
  a + theme(legend.position = "none"),
  b + 
    theme(
      legend.position = "none",
      axis.title.y = element_blank()
    ),
  labels = c("A", "B"),
  nrow = 1
)
*Comparison of the 25 most impacted countries by fatalities per million showing A) the number of fatalities scaled by population size and B) Total Fatalities.
For fatality rates presented scaled by population size, it is important to realise that a value of 500 indicates that one in every 2000 people from the total population has died (ignoring demographics).
Similarly, a value of 100 indicates that 1 in every 10,000 from the total population has died in that country.
Whilst the US currently has the most fatalities, 1 in every 6,334 from the US population have currently been confirmed to have died from COVID19.
For Belgium, 1 in every 1,728 people have died directly from COVID 19.
Only countries with populations > 4,000,000, and with more than 500 fatalities are shown.
Countries are also grouped by their continent as designated by the UN classifications*

Comparison of the 25 most impacted countries by fatalities per million showing A) the number of fatalities scaled by population size and B) Total Fatalities. For fatality rates presented scaled by population size, it is important to realise that a value of 500 indicates that one in every 2000 people from the total population has died (ignoring demographics). Similarly, a value of 100 indicates that 1 in every 10,000 from the total population has died in that country. Whilst the US currently has the most fatalities, 1 in every 6,334 from the US population have currently been confirmed to have died from COVID19. For Belgium, 1 in every 1,728 people have died directly from COVID 19. Only countries with populations > 4,000,000, and with more than 500 fatalities are shown. Countries are also grouped by their continent as designated by the UN classifications

Table of Fatalities

minPop <- 4e6
minCases <- 1000
deaths %>%
  left_join(confirmed) %>%
  group_by(Country, date) %>%
  summarise_at(vars(deaths, confirmed), sum) %>%
  dplyr::filter(deaths > 0) %>%
  left_join(wikiPops) %>%
  ungroup() %>%
  mutate(
    infectionRate = round(1e6 * confirmed / Population, 1),
    fatalityRate = deaths / confirmed,
    Population = round(Population / 1e6, 3)
  ) %>%
  arrange(desc(date), fatalityRate) %>%
  distinct(Country, .keep_all = TRUE) %>%
  arrange(fatalityRate) %>%
  dplyr::select(
    Continent, Region, Country, Population,
    `Confirmed Cases` = confirmed, 
    `Infection Rate (per million)` = infectionRate,
    `Fatalities` = deaths, 
    `Fatality Rate` = fatalityRate
  ) %>%
  datatable(
    options = list(
      pageLength = 25, 
      autoWidth = TRUE,
      searchCols = list(
        NULL, NULL, NULL, 
        list(
          search = glue(
            '{minPop/1e6} ... {max(wikiPops$Population/1e6)}'
          )
        ),
        list(
          search = glue(
            '{minCases} ... {max(.$`Confirmed Cases`)}'
          )
        ),
        NULL, NULL, NULL
      )
    ),
    filter = 'top',
    class = "stripe",
    rownames = FALSE,
    caption = glue(
      "All countries with recorded fatalities, sorted by default in increasing order of the Fatality Rate.
      The Fatality Rate simply indicates the number of confirmed cases which end in a fatality.
      The Infection Rate represents the cumulative number of cases confirmed within the country, per million people.
      All columns are searchable and filters are set by default to exclude countries with populations below {comma(minPop)} and those with fewer than {minCases} confirmed cases."
    )
  ) %>%
  formatRound(
    columns = "Infection Rate (per million)",
    mark = ",",
    digits = 1
  ) %>%
  formatPercentage(
    columns = "Fatality Rate",
    digits = 2
  )

Dimensional Reduction

In order to summarise which countries are the most similar to each other, a Principal Component Analysis was performed. This enables the multi-dimensional data of the above plots to summarised in two dimensions. As missing data cannot be included in this analysis, several countries which are at earlier comparative time-points than Australia were omitted from this analysis.

nDays <- confirmed %>% 
  dplyr::filter(Country == "Australia") %>% 
  group_by(Country, date) %>%
  summarise(confirmed = sum(confirmed)) %>%
  ungroup() %>%
  inner_join(wikiPops) %>%
  mutate(
    rate = 1e6*confirmed / Population
  ) %>% 
  dplyr::filter(rate > startingPoint) %>% 
  nrow() %>%
  subtract(4)
minPop <- 4e6
pca <- confirmed %>%
  group_by(Country, date) %>%
  summarise(confirmed = sum(confirmed)) %>%
  ungroup() %>%
  inner_join(wikiPops) %>%
  mutate(
    rate = 1e6*confirmed / Population
  ) %>%
  dplyr::filter(
    rate > startingPoint,
    Population > minPop | Country %in% alwaysShow
  ) %>%
  group_by(Country) %>%
  mutate(
    days = date - min(date)
  ) %>%
  dplyr::filter(
    max(days) >= nDays
  ) %>%
  ungroup() %>%
  mutate(
    days = as.integer(days),
    rate = round(rate, 2)
  ) %>%
  dplyr::filter(days <= nDays) %>%
  dplyr::select(Country, rate, days) %>%
  pivot_wider(
    values_from = rate,
    names_from = days
  ) %>%
  as.data.frame() %>%
  column_to_rownames("Country") %>%
  as.matrix() %>%
  .[!rowAnyNAs(.),] %>%
  log() %>%
  prcomp() 
set.seed(101)
pca$x %>%
  as.data.frame() %>%
  rownames_to_column("Country") %>%
  mutate(
    Cluster = cbind(PC1, PC2) %>%
      apply(2, function(x){
        (x - mean(x)) / sd(x)
      }) %>%
      kmeans(centers = k, nstart = 10) %>%
      .[["cluster"]] %>%
      as.factor()
  ) %>%
  ggplot(aes(PC1, PC2, label = Country, colour = Cluster)) +
  geom_point() +
  geom_text_repel(
    show.legend = FALSE
  ) +
  stat_ellipse(
    aes(fill = Cluster),
    colour = NA,
    geom = "polygon",
    alpha = 0.1
    ) +
  xlab(
    paste0(
      "PC1 (", pcPercent[['PC1']],")"
      )
  ) +
    ylab(
    paste0(
      "PC2 (", pcPercent[['PC2']], ")"
      )
  ) +
  theme(
    legend.position = "none"
  )
*Dimensional reduction showing which countries infection rates are the most similar to each other __at the 43 day time point, after passing 4 confirmed cases/million__.
This may or may not be indicative of future spread within the population.
The value 43 days was simply chosen as this marks how long since the US passed this threshold.
Countries which have not progressed beyond 4 confirmed cases/million for 43 days or more are not shown.
Countries with populations < 4,000,000 are also excluded.
Clusters were generated using k-means, manually specifying k = 4 and are not definitive.
Principal Component 1, on the x-axis, corresponds to the greatest source of variability within the data 
(94.4%), and countries which appear near each other along this axis can be assumed to be showing similar growth in confirmed infection rates at this time point.
Separation along the y-axis is less significant, but also worthy of note, as this represents 3.9% of variability within the data.*

Dimensional reduction showing which countries infection rates are the most similar to each other at the 43 day time point, after passing 4 confirmed cases/million. This may or may not be indicative of future spread within the population. The value 43 days was simply chosen as this marks how long since the US passed this threshold. Countries which have not progressed beyond 4 confirmed cases/million for 43 days or more are not shown. Countries with populations < 4,000,000 are also excluded. Clusters were generated using k-means, manually specifying k = 4 and are not definitive. Principal Component 1, on the x-axis, corresponds to the greatest source of variability within the data (94.4%), and countries which appear near each other along this axis can be assumed to be showing similar growth in confirmed infection rates at this time point. Separation along the y-axis is less significant, but also worthy of note, as this represents 3.9% of variability within the data.

Fatality, Recovery and Active Infection Rates

All rates presented in this section do not take population size into account, but simply look at the rates of recovery and fatalities within each country’s cohort.

Summary of All Rates

fr <- confirmed %>%
  inner_join(deaths) %>%
  group_by(Country) %>%
  dplyr::filter(date == max(date)) %>%
  ungroup() %>%
  summarise(fr = sum(deaths) / sum(confirmed)) %>%
  .[["fr"]]
rr <- confirmed %>%
  group_by(Country, date) %>%
  summarise_at(vars(confirmed), sum, na.rm = TRUE) %>%
  left_join(
    recovered %>% 
      group_by(Country, date) %>%
      summarise_at(vars(recovered), sum, na.rm = TRUE)
  ) %>%
  dplyr::filter(date == max(date)) %>%
  ungroup() %>%
  summarise(rr = sum(recovered) / sum(confirmed)) %>%
  .[["rr"]]

Summarising all available data from all countries:

  • The current fatality rate from all confirmed cases is 7.0%. This may be a function of under-reporting of true cases and is very likely to be an overestimate.
  • The current recovery rate from all confirmed cases is 28.1%. Given the level of under-reporting this may also be highly inaccurate.
  • At this point, 64.9% of all confirmed infections are considered as ‘active’.
minPop <- 10e6
p4 <- confirmed %>%
  dplyr::filter(
    confirmed > 0
  ) %>%
  left_join(deaths) %>%
  group_by(Country, date) %>%
  summarise_at(vars(confirmed, deaths), sum) %>%
  left_join(
    recovered %>%
      group_by(Country, date) %>%
      summarise_at(vars(recovered), sum)
  ) %>%
  ungroup() %>%
  inner_join(
    wikiPops %>% dplyr::filter(Population > minPop | Country %in% alwaysShow)
  ) %>%
  mutate(rate = 1e6 * confirmed / Population) %>%
  dplyr::filter(rate > startingPoint) %>% 
  arrange(date)%>% 
  mutate(Country = fct_inorder(Country)) %>%
  group_by(Country) %>%
  dplyr::filter(date == max(date)) %>%
  mutate(
    active = confirmed - recovered - deaths
  ) %>%
  mutate(
    active = 100*active / confirmed,
    recovered = 100*recovered / confirmed,
    fatalities = 100*deaths / confirmed
  ) %>%
  dplyr::filter(active < 100) %>%
  pivot_longer(
    cols = c(active, recovered, fatalities),
    names_to = "Status",
    values_to = "Percentage"
  ) %>%
  mutate(
    Status = str_to_title(Status),
    Status = factor(
      Status, 
      levels = c("Active", "Recovered", "Fatalities")
    ),
    Percentage = round(Percentage, 2)
  ) %>%
  mutate(confirmed = comma(confirmed)) %>%
  rename(Confirmed = confirmed) %>%
  ggplot(
    aes(
      Country, Percentage,
      fill = Status, cases = Confirmed
    )
  ) +
  geom_col() +
  scale_fill_manual(
    values = c(
      Active = "blue",
      Recovered = "green",
      Fatalities = "red"
      )
  ) +
  scale_y_continuous(expand = expansion(0, 0)) +
  coord_flip() +
  labs(x = c()) +
  theme(
    legend.position = "none"
  )
ggplotly(p4)

Fatality, Recovery and Active Infection rates for countries which have exceeded 4 confirmed cases / million, and with a population size > 10,000,000. Countries are shown in order of the date they passed 4 confirmed cases / million

Fatality Rate Vs Time

minDays <- 12
minPop <- 10e6
df <- confirmed %>%
  inner_join(deaths) %>%
  group_by(Country, date) %>%
  summarise_at(
    vars(confirmed, deaths),
    sum
  ) %>%
  inner_join(
    wikiPops %>% dplyr::filter(Population > minPop | Country %in% alwaysShow)
  ) %>%
  ungroup() %>%
  mutate(
    `Infection Rate` = 1e6 * confirmed / Population
  ) %>%
  dplyr::filter(
    `Infection Rate` > startingPoint
    ) %>%
  group_by(Country) %>%
  mutate(
    Days = date - min(date),
    n = n()
  ) %>%
  dplyr::filter(
    n > minDays,
    max(deaths, na.rm = TRUE) > 0
  ) %>%
  mutate(
    Rate = deaths / confirmed,
    `Fatality Rate` = percent(Rate, accuracy = 0.1),
    minusT = date - max(date)
  ) %>%
  ungroup() 
plotFr <- mutate(
  df,
  Country = factor(
    Country, 
    levels = df %>%
      dplyr::select(Country, minusT, Rate) %>%
      pivot_wider(
        id_cols = Country, 
        names_from = minusT, 
        values_from = Rate
      ) %>% 
      as.data.frame() %>%
      column_to_rownames("Country") %>% 
      dist() %>% 
      hclust() %>% 
      as.dendrogram() %>% 
      labels()
  )
) %>% 
  rename_all(str_to_title) %>%
  ggplot(
    aes(
      x = Days, y = Country, fill = Rate,
      conf = Confirmed, 
      deaths = Deaths,
      date = Date,
      label = `Fatality Rate`
      )
  ) +
  geom_raster() +
  geom_vline(
    aes(xintercept = Days + 0.5),
    data = . %>%
      dplyr::filter(Country == "Australia") %>%
      dplyr::filter(Date == max(Date)),
    linetype = 3,
    size = 1/3,
    colour = "grey70"
  ) +
  scale_fill_viridis_c(
    option = "magma",
    breaks = seq(0, 0.15, by = 0.025)
  ) +
  scale_x_continuous(
    expand = expansion(0, 0),
    labels = abs
    ) +
  scale_y_discrete(expand = expansion(0, 0)) +
  labs(
    x = glue(
      "Days since passing {startingPoint} cases/million"
    ),
    y = c(),
    fill = "Fatality\nRate"
  ) +
  theme(
    panel.grid = element_blank()
  )
cpFr <- glue(
  "*Fatality Rate for confirmed cases after passing {startingPoint} confirmed cases / million.
  Only countries with {minDays} days of data beyond this time-point and a population size >{comma(minPop)} are shown.
  Country order is based on clustering using the most recent values.
  Countries with no recorded fatalities have been excluded for obvious reasons. 
  The dashed grey line indicates the time-point Australia is currently at.
  A clear trend of an increasing fatality rate with time is evident in many countries (e.g. Spain, France, Italy, UK), however, for some countries (e.g. Singapore) this rate appears relatively stable throughout the majority of days.
  The overall Fatality Rate for confirmed cases is currently ({percent(fr, accuracy = 0.1)}).*"
)
ggplotly(
  plotFr,
  tooltip = c(
    "Country", "Date", "Days", "Confirmed", "Deaths",
    "Fatality Rate"
    )
  )

Fatality Rate for confirmed cases after passing 4 confirmed cases / million. Only countries with 12 days of data beyond this time-point and a population size >10,000,000 are shown. Country order is based on clustering using the most recent values. Countries with no recorded fatalities have been excluded for obvious reasons. The dashed grey line indicates the time-point Australia is currently at. A clear trend of an increasing fatality rate with time is evident in many countries (e.g. Spain, France, Italy, UK), however, for some countries (e.g. Singapore) this rate appears relatively stable throughout the majority of days. The overall Fatality Rate for confirmed cases is currently (7.0%).

Unscaled Fatality Rate

minDeaths <- 10
ausDays <- deaths %>%
  dplyr::filter(Country == "Australia") %>% 
  group_by(date) %>%
  summarise_at(vars(deaths), sum) %>% 
  dplyr::filter(deaths > 10) %>%
  nrow()
minDays <- ausDays - 2
minPop <-10e6
deathPlot <- confirmed %>%
  left_join(deaths) %>%
  group_by(Country, date) %>%
  summarise_at(
    vars(confirmed, deaths),
    sum
  ) %>%
  ungroup() %>%
  dplyr::filter(
    deaths >= minDeaths,
    Country %in% c(dplyr::filter(wikiPops, Population > minPop)$Country, alwaysShow)
  ) %>%
  group_by(Country) %>%
  mutate(Days = as.integer(date - min(date))) %>%
  dplyr::filter(
    max(Days) > minDays | Country %in% alwaysShow
  ) %>%
  ungroup() %>%
  rename_all(str_to_title) %>%
  arrange(desc(Days)) %>%
  mutate(Country = fct_inorder(Country)) %>%
  ggplot(
    aes(
      x = Days, y = Deaths, colour = Country,
      label = Date, conf = Confirmed)
    ) +
    geom_vline(
    aes(xintercept = Days),
    data = . %>% 
      dplyr::filter(Country == "Australia") %>%
      slice(1),
    linetype = 3,
    colour = "blue"
  ) +
    geom_hline(
    aes(yintercept = Deaths),
    data = . %>% 
      dplyr::filter(Country == "Australia") %>%
      slice(1),
    linetype = 3,
    colour = "blue"
  ) +
  geom_point() +
  geom_line() +
  scale_y_log10() +
  labs(
    x = glue("Days Since Passing {minDeaths} Deaths"),
    y = "Cumulative Fatalities"
  )
ggplotly(deathPlot)

As with the spread of the virus, fatalities also grow at an exponential rate. Any slowing in the growth of fatalities is an accurate marker for when the spread of the virus has definitively slowed, despite being a significantly lagging marker. Cumulative fatalities are only shown for countries which have seen 10 or more fatalities for a period beyond 29 days, and for countries with a population > 10,000,000. The blue dashed lines indicate the current position of Australia on this plot.

Daily Increase in Fatalities Vs Confirmed Fatalities

minCases <- 50
minPop <- 10e6
minDays <- 7
dailyVsTotalDeaths <- deaths %>%
  group_by(Country, date) %>%
  summarise_at(vars(deaths), sum) %>%
  dplyr::filter(deaths > 0) %>%
  split(f = .$Country) %>%
  lapply(function(x, n = 7){
    x %>%
      mutate(
        d = c(0, diff(deaths))
      ) %>%
      mutate_at(
        vars(deaths, d), sma, n = n
      )
  }
  ) %>%
  bind_rows() %>%
  inner_join(wikiPops) %>%
  dplyr::filter(deaths > minCases) %>%
  mutate_at(vars(deaths, d), round, 2) %>%
  mutate(n = n()) %>%
  dplyr::filter(
    n > minDays | Country %in% alwaysShow,
    Population > minPop,
    d > 0
  ) %>%
  rename_all(str_to_title) %>%
  rename(
    `Average Daily Fatalities` = D,
    `Average Total Fatalities` = Deaths
  ) %>%
  arrange(Date) %>%
  ungroup() %>%
  mutate(
    Population = comma(round(Population, -3)),
    Country = fct_inorder(Country)
    ) %>%
  ggplot(aes(`Average Total Fatalities`, `Average Daily Fatalities`, colour = Country, label = Date, key = Population)) +
  geom_hline(
    aes(yintercept = `Average Daily Fatalities`),
    data = . %>%
      dplyr::filter(Country == "Australia", Date == max(Date)),
    linetype = 3,
    colour = "blue",
    size = 1/3
  ) +
  geom_vline(
    aes(xintercept = `Average Total Fatalities`),
    data = . %>%
      dplyr::filter(Country == "Australia", Date == max(Date)),
    linetype = 3,
    colour = "blue",
    size = 1/3
  ) +
  geom_point() +
  geom_line() +
  scale_x_log10(labels = comma) +
  scale_y_log10(labels = comma) +
  labs(
    x = "Total Confirmed Fatalities",
    y = 'Daily Fatalities'
  )
capDailyVsTotal <- glue(
  "*Daily Fatalities plotted against Total Fatalities.
  These two values are directly proportional until interventions are successful, at which point the proportional relationship changes, as evidenced by a sudden downwards turn.
  Values shown are 7-day moving averages in order to minimise the impact of day-to-day variation.
  Countries are only shown from the point at which the moving average exceeds {minCases} cases, and have exceeded this value for > {minDays} days.
  Data is additionally restricted to countries with a population > {comma(minPop)}, for greater clarity.
  Australia's position is indicated by the intersection of the dashed blue lines.
  Importantly, __due to the time taken from the initial infection to the day of death, this is a lag indicator of the control of infection__.*
  "
)
ggplotly(dailyVsTotalDeaths)

Daily Fatalities plotted against Total Fatalities. These two values are directly proportional until interventions are successful, at which point the proportional relationship changes, as evidenced by a sudden downwards turn. Values shown are 7-day moving averages in order to minimise the impact of day-to-day variation. Countries are only shown from the point at which the moving average exceeds 50 cases, and have exceeded this value for > 7 days. Data is additionally restricted to countries with a population > 10,000,000, for greater clarity. Australia’s position is indicated by the intersection of the dashed blue lines. Importantly, due to the time taken from the initial infection to the day of death, this is a lag indicator of the control of infection.

Australian Specific Data

ausPops <- tribble(
  ~State, ~Population,
  "New South Wales",    8117976,
  "Victoria", 6629870,
   "Queensland", 5115451,
  "South Australia", 1756494,
  "Western Australia", 2630557,
  "Tasmania", 535500,
  "Northern Territory", 245562,
  "Australian Capital Territory", 428060
)

Australian State populations were taken from the ABS Website and were accurate in Sept 2019. The difference with previous estimates used above was within 0.04%, and as such no adjustments were made.

A series of complimentary charts regarding the spread of COVID-19 are available from the ABC website.

  • It is now clear that Australia is no longer in an exponential growth phase and there are clear signs of the growth levelling out.
  • Using an estimated population size of 25,203,198, the total percentage of the Australian population confirmed as having been infected currently sits at 0.027%, or one person in every 3,765.
  • Notably, the proportion of recovered cases is now >70%
confirmed %>% 
  dplyr::filter(
    Country == "Australia", 
    date >= max(date) - 1
  ) %>%
  bind_rows(
    group_by(., date) %>%
      summarise(
        confirmed = sum(confirmed)
      ) %>%
      mutate(
        `Province/State` = "Total"
      )
  ) %>%
  group_by(`Province/State`) %>%
  mutate(
    `Daily Change` = c(NA, diff(confirmed)),
    `% Change` = c(NA, diff(confirmed)) / min(confirmed)
  ) %>%
  ungroup() %>%
  pivot_wider(
    id_cols = `Province/State`, 
    names_from = date, 
    values_from = c(confirmed, `Daily Change`, `% Change`)
  ) %>%
  dplyr::select_if(function(x){sum(is.na(x)) <= 1}) %>%
  rename(State = `Province/State`) %>%
  rename_all(str_remove_all, pattern = "confirmed_") %>%
  rename_all(str_remove_all, pattern = "_2020.+") %>%
  left_join(
    dplyr::select(latestAU, State, confirmed, deaths, recovered)
  ) %>%
  mutate_at(
    vars(confirmed, deaths, recovered),
    function(x){
      x[is.na(x)] <- sum(x, na.rm = TRUE)
      x
    }
  ) %>%
  mutate(
    `% Change` = percent(`% Change`, accuracy = 0.01),
    `Fatality Rate` = percent(deaths / confirmed, accuracy = 0.1),
    `Recovery Rate` = percent(recovered / confirmed, accuracy = 0.1),
    `Currently Active` = confirmed - recovered - deaths,
  ) %>%
  rename(
    Fatalities = deaths,
    Recovered = recovered
  ) %>%
  dplyr::select(
    State, starts_with("20"), ends_with("Change"), starts_with("Fatal"), starts_with("Recov"), `Currently Active`
  ) %>%
  pander(
    justify = "lrrrrrrrrr",
    caption = paste(
      "*Confirmed cases, fatalities and recoveries reported by each state at time of preparation.",
      "Any states with unchanged, or decreasing confirmed cases may indicate delays with the automated data sources, such as health.gov.au or JHU, or that these states have not yet reported for the day.*"
    ),
    emphasize.strong.rows = nrow(.)
  )
Confirmed cases, fatalities and recoveries reported by each state at time of preparation. Any states with unchanged, or decreasing confirmed cases may indicate delays with the automated data sources, such as health.gov.au or JHU, or that these states have not yet reported for the day.
State 2020-04-24 2020-04-25 Daily Change % Change Fatalities Fatality Rate Recovered Recovery Rate Currently Active
Australian Capital Territory 105 106 1 0.95% 3 2.8% 98 92.5% 5
New South Wales 2,982 2,994 12 0.40% 35 1.2% 2,193 73.2% 766
Northern Territory 28 28 0 0.00% 0 0.0% 20 71.4% 8
Queensland 1,026 1,026 0 0.00% 6 0.6% 803 78.3% 217
South Australia 438 438 0 0.00% 4 0.9% 402 91.8% 32
Tasmania 207 207 0 0.00% 10 4.8% 117 56.5% 80
Victoria 1,343 1,346 3 0.22% 16 1.2% 1,262 93.8% 68
Western Australia 548 549 1 0.18% 8 1.5% 478 87.1% 63
Total 6,677 6,694 17 0.25% 82 1.2% 5,373 80.3% 1,239

Plot of Current Australian Values

ausStatsPlot <- ggplotly(
  confirmed %>%
    dplyr::filter(Country == "Australia", confirmed > 0) %>%
    left_join(deaths) %>%
    left_join(recovered) %>%
    group_by(Country, date) %>%
    summarise_at(vars(confirmed, deaths, recovered), sum) %>%
    ungroup() %>%
    mutate_at(vars(confirmed, deaths, recovered), cummax) %>%
    mutate(active = confirmed - deaths - recovered) %>%
    pivot_longer(
      cols = c(active, confirmed, deaths, recovered),
      names_to = "Type",
      values_to = "Total"
    ) %>%
    arrange(Type, date) %>%
    dplyr::filter(date > ymd("2020-02-29")) %>%
    mutate(
      Type = str_to_title(Type),
      Type = str_replace_all(Type, "Deaths", "Fatalities"),
    ) %>%
    dplyr::filter(Total > 0) %>%
    rename_all(str_to_title) %>%
    ggplot(aes(Date, Total, colour = Type)) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, size = 1/2) +
    scale_y_log10() +
    scale_colour_manual(
      values = c(
        Active = rgb(0, 0, 0),
        Confirmed = rgb(0, 0.3, 0.7),
        Fatalities = rgb(0.8, 0.2, 0.2),
        Recovered = rgb(0.2, 0.7, 0.4)
      )
    )
)
ausStatsPlot$x$data[5:8] %<>%
  lapply(function(x){
    x$hoverinfo <- "none"
    x
  })
ausStatsCap <- "*Current confirmed and recovered cases, along with fatalities for Australia only. Active cases are shown as confirmed cases excluding fatalities and those classed as recovered. Loess curves through all points are shown as continuous lines. Data is only shown from 1^st^ March 2020 as this was the date of the first recorded fatality in Australia. Recovered patient information was also sparse in the early stages of data collection, and as a result estimates of active infections will be a significant underestimate until 6^th^ April. In particular, QLD only began reporting recovered cases on this date. NSW followed a fornight after this date and as such, only the most recent numbers can be considered as accurate. Below this plot, the same figures can be seen broken down by state.*"
ausStatsPlot

Current confirmed and recovered cases, along with fatalities for Australia only. Active cases are shown as confirmed cases excluding fatalities and those classed as recovered. Loess curves through all points are shown as continuous lines. Data is only shown from 1st March 2020 as this was the date of the first recorded fatality in Australia. Recovered patient information was also sparse in the early stages of data collection, and as a result estimates of active infections will be a significant underestimate until 6th April. In particular, QLD only began reporting recovered cases on this date. NSW followed a fornight after this date and as such, only the most recent numbers can be considered as accurate. Below this plot, the same figures can be seen broken down by state.

ggplotly(
  confirmed %>%
    dplyr::filter(Country == "Australia") %>%
    left_join(deaths) %>%
    left_join(recovered) %>%
    rename(State = `Province/State`) %>%
    dplyr::filter(
      date > ymd("2020-03-19"),
    ) %>%
    arrange(date) %>%
    group_by(State) %>%
    mutate_at(
      vars(confirmed, recovered, deaths), cummax
    ) %>%
    ungroup() %>%
    left_join(ausPops) %>%
    mutate(active = confirmed - deaths - recovered) %>%
    pivot_longer(
      cols = c(confirmed, deaths, recovered, active),
      names_to = "status",
      values_to = "count"
    ) %>%
    dplyr::filter(
      count > 0,
      !(State %in% c("Queensland", "New South Wales") & status == "recovered" & date < ymd("2020-04-06")),
      !(State %in% c("South Australia") & status == "recovered" & date < ymd("2020-04-01")),    
      !(State %in% c("Tasmania") & status == "recovered" & date < ymd("2020-04-02")),
    ) %>%
    mutate(
      rate = 1e6*count/Population,
      rate = round(rate, 2),
      status = str_replace(status, "deaths", "fatal") %>% str_to_title(),
      status = factor(status, levels = c("Confirmed", "Active", "Recovered", "Fatal"))
    ) %>%
    rename_all(str_to_title) %>%
    ggplot(aes(Date, Rate, colour = Status, label = Count)) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, size = 1/2) +
    scale_y_log10() +
    scale_colour_manual(
      values = c(
        Active = rgb(0, 0, 0),
        Confirmed = rgb(0, 0.3, 0.7),
        Fatal = rgb(0.8, 0.2, 0.2),
        Recovered = rgb(0.2, 0.7, 0.4)
      )
    ) +
    facet_wrap(~State, ncol = 4) +
    labs(
      x = "Date",
      y = "Cases / Million",
      colour = "Case Status"
    )
)

Breakdown of individual states. Victorian recovered numbers began to be accurately reported from 22nd March, with other states gradually providing this information. NSW/QLD recovered cases have only recently begun being reported and up until the most recent dates, recovered/active values were very approximate for these states.

State-wise Comparison of Confirmed Infection Rates

minRate <- 5
p5 <- confirmed %>%
  dplyr::filter(
    Country == "Australia"
  ) %>%
  dplyr::rename(State = `Province/State`) %>%
  left_join(ausPops) %>%
  mutate(
    Rate = round(1e6*confirmed / Population, 2),
    Date = format.Date(date, "%d-%B")
  ) %>%
  dplyr::filter(
    !is.na(Population),
    Rate > minRate
  ) %>%
  arrange(date) %>%
  mutate(
    State = fct_inorder(State)
  ) %>%
  dplyr::rename(
    Confirmed = confirmed
  ) %>%
  ggplot(
    aes(
      x = date, y = Rate, colour = State, 
      label = Date, key = Confirmed
      )
  ) +
  geom_point() +
  geom_smooth(
    data = . %>%
      dplyr::filter(date >= dt - 14),
    method = "lm",
    se = FALSE,
    show.legend = FALSE,
    size = 1/2
  ) +
  geom_smooth(
    method = "loess",
    se = FALSE,
    linetype = 3,
    size = 1/2
  ) +
  scale_y_log10() +
  labs(
    x = "Date",
    y = "Confirmed Infection Rate (cases/million)"
  )
# Hide the tooltip from the regression lines
n <- length(levels(p5$data$State))
p5 <- ggplotly(p5, tooltip = c("Date", "Rate", "State", "Confirmed"))
regIndex <- seq(n + 1, length.out = length(p5$x$data) , by = 1)
p5$x$data[regIndex] <- lapply(
  p5$x$data[regIndex],
  function(x){
  x$hoverinfo <- "none"
  x
})
p5

Infection rates for each state with data beginning for each state once 5 confirmed cases /million was exceeded. Regression lines for the last 2 weeks are shown for each state as solid lines, with loess curves through the complete set of daily points shown as dashed lines. The y-axis is on a logarithmic scale indicating exponential growth is occurring. States are shown in order of the initial date they passed 5 cases/million. Most states show a strong upward curve during the week following 19th March, clearly showing the impact of the Ruby Princess, however the consistent deviation below regression lines for all states is strongly suggestive of a slowing in the recent infection rate. The hypothetical growth in the infection rate without that event occurring is not calculated here, however it is clear that the spread of COVID-19 has not continued at the same pace after that event initially occurred.

lm <- confirmed %>%
  dplyr::filter(
    Country == "Australia"
  ) %>%
  dplyr::rename(State = `Province/State`) %>%
  left_join(ausPops) %>%
  mutate(
    Rate = round(1e6*confirmed / Population, 2),
    Date = format.Date(date, "%d-%B")
  ) %>%
  dplyr::filter(
    !is.na(Population),
    Rate > minRate,
    date <= dt,
    date >= dt - 13
  ) %>%
  arrange(desc(confirmed)) %>%
  mutate(
    State = fct_inorder(State)
  ) %>%
  dplyr::rename(
    Confirmed = confirmed
  ) %>%
  arrange(State, Date) %>%
  # distinct(State, Confirmed, .keep_all = TRUE) %>%
  with(
    lm(log10(Rate) ~ (State + date)^2)
  ) 

In order to test whether the infection rates are different between states, a linear regression model was fit. \(\log_{10}\)(Cumulative Confirmed Infection Rate) was assigned as the response variable with predictor variables being the State and Date. Each State was assigned its own intercept and slope by use of an interaction term (i.e. State:date). Given the initially larger case numbers in NSW, this state was set as the baseline, with each other slope (i.e. interaction term) being presented as the difference in slope between each state and NSW. In this way comparisons against NSW were performed, but no comparisons between other states were performed. Currently infecton rates have completely stabilised.

Differences in the State-level intercepts are not particularly relevant and these are not shown. Differences between State-level slopes and NSW however, are of great interest, and as such, only the slopes fr each state are shown. For NSW this captures the actual slope of the daily change in infection rate, whilst for all other States, this represents the difference between the daily change in infection rate for that State in comparison to NSW. Only differences in slope with an Bonferroni-adjusted p-value < 0.05 are of particular interest. For those States which appear to be of interest, a negative value indicates an infection rate increasing more slowly than NSW, whilst a positive value indicates the opposite. If no difference is noted between any state and NSW, it can be considered that these states are all experiencing growth at a similar rate.

st <- shapiro.test(resid(lm))$p.value
bp <- olsrr::ols_test_breusch_pagan(lm)$p
lm %>%
  tidy() %>%
  mutate(
    term = str_remove_all(term, "State")
  ) %>%
  rename(
    Term = term,
    Estimate = estimate,
    SE = std.error,
    `T` = statistic,
    p = p.value
  ) %>%
  dplyr::filter(
    str_detect(Term, "date")
  ) %>%
  mutate(
    Term = str_remove(Term, ":date"),
    Term = str_replace_all(Term, "date", "NSW Daily Increase"),
    adjP = p.adjust(p, method = "bonf"),
    Estimate = sprintf("%.4f", Estimate),
    SE = sprintf("%.3f", SE),
    `T` = sprintf("%.2f", `T`),
    p = case_when(
      p < 1e-4 ~ sprintf("%.2e", p),
      p >= 1e-4 ~ sprintf("%.4f", p)
    ),
    adjP = case_when(
      adjP < 1e-4 ~ sprintf("%.2e", adjP),
      adjP >= 1e-4 ~ sprintf("%.4f", adjP)
    )
  ) %>%
  rename(
    `p~bonf~` = adjP
  ) %>%
  pander(
    justify = "lrrrrr",
    emphasize.strong.rows = which(
      as.numeric(.$`p~bonf~`) < 0.1 & .$Term != "date"
    ),
    caption = paste(
      "*Results of linear regression analysis for the last 14 days only,",
      "comparing the slopes of lines which track __daily",
      "change in cumulative confirmed infection rates__,",
      "scaled for population size in each state.",
      "Intercept terms are not shown.",
      "The daily slope of the NSW regression line is given as the first value, with any differences between this line and other states shown as the subsequent rows.",
      "All slopes and differences in slopes are given on the log~10~ scale.",
      "Any highlighted state indicates an infection rate which is increasing at a different daily rate to NSW.",
      "Positive values in the Estimate column indicate a greater rate the NSW, whilst a negative value in the Estimate column indicates a slower growth rate than NSW.",
      "The fitted model was also checked using the Shapiro-Wilk Test for normality",
      paste0(
        "(p = ",
        sprintf( "%.4f", st),
        "). ",
        ifelse(st < 0.05, "The current results indicate potential violations of the assumption of normality and results in this table __should be taken with caution__.", "No issues with the assumption of normality were noted.")
      ),
      paste0(
        "Heteroscedasticity was tested using the Breusch-Pagan test (p = ",
        sprintf( "%.4f", bp),
        "). ",
        ifelse(bp < 0.05, "Given that unequal variances were detected, results and predictions from this model __should be taken with caution__* The source of this unequal variance is currently attributable to the outbreak in Tasmania.", "No issues with the assumption of constant variance were noted.*")
      )
    )
  )
Results of linear regression analysis for the last 14 days only, comparing the slopes of lines which track daily change in cumulative confirmed infection rates, scaled for population size in each state. Intercept terms are not shown. The daily slope of the NSW regression line is given as the first value, with any differences between this line and other states shown as the subsequent rows. All slopes and differences in slopes are given on the log10 scale. Any highlighted state indicates an infection rate which is increasing at a different daily rate to NSW. Positive values in the Estimate column indicate a greater rate the NSW, whilst a negative value in the Estimate column indicates a slower growth rate than NSW. The fitted model was also checked using the Shapiro-Wilk Test for normality (p = 0.0000). The current results indicate potential violations of the assumption of normality and results in this table should be taken with caution. Heteroscedasticity was tested using the Breusch-Pagan test (p = 0.0000). Given that unequal variances were detected, results and predictions from this model should be taken with caution The source of this unequal variance is currently attributable to the outbreak in Tasmania.
Term Estimate SE T p pbonf
NSW Daily Increase 0.0017 0.001 2.97 0.0038 0.0303
Victoria 0.0003 0.001 0.31 0.7541 1.0000
Queensland -0.0002 0.001 -0.23 0.8160 1.0000
Western Australia 0.0004 0.001 0.55 0.5809 1.0000
South Australia -0.0010 0.001 -1.23 0.2226 1.0000
Tasmania 0.0122 0.001 15.28 1.89e-27 1.51e-26
Australian Capital Territory -0.0009 0.001 -1.12 0.2652 1.0000
Northern Territory -0.0017 0.001 -2.10 0.0385 0.3079

Assessment of Testing Within Each State

latestAU %>%
  dplyr::select(State, Tested = tested) %>%
  write_tsv(
    glue(
      "tested/tested_{format(dt, '%Y%m%d')}.tsv"
    )
  )

Testing numbers were initially sourced from manual inspection of individual press releases for NSW, QLD, VIC, WA, SA, TAS, ACT and the NT. Updates on testing numbers beyond the initial values were performed automatically using the above code which probes each state’s official releases for the latest values, and updates where found. The number of tested individuals in each state was then assessed as a function of State population size. All results are valid at the time of report generation.

latestAU %>%
  dplyr::select(State, Tested = tested) %>%
  full_join(
    confirmed %>%
      dplyr::filter(
        date == max(date),
        Country == "Australia"
      ) %>%
      rename(
        State = `Province/State`
      ) 
  ) %>%
  mutate(
    Tested = case_when(
      is.na(Tested) ~ confirmed,
      Tested < confirmed ~ confirmed,
      !is.na(Tested) ~ Tested
    )
  ) %>%
  left_join(ausPops) %>%
  bind_rows(
    tibble(
      State = "Total",
      Population = sum(.$Population, na.rm = TRUE),
      confirmed = sum(.$confirmed, na.rm = TRUE),
      Tested = sum(.$Tested, na.rm = TRUE)
    )
  ) %>%
  mutate(
    `Tests / '000` = round(1e3 * Tested / Population, 2),
    Positive = confirmed / Tested,
    Negative = 1 - Positive,
    isTotal = grepl("Total", State)
  ) %>%
  dplyr::select(
    State, Population,
    Confirmed = confirmed,
    Tested, 
    contains("000"), 
    ends_with("ive"),
    isTotal
  ) %>%
  arrange(isTotal, desc(`Tests / '000`)) %>%
  dplyr::select(-isTotal) %>%
  dplyr::rename(
    `% Positive Tests` = Positive,
    `% Negative Tests` = Negative
  ) %>%
  mutate_at(
    vars(starts_with("%")), percent, accuracy = 0.01
  ) %>%
  pander(
    justify = "lrrrrrr",
    missing = "",
    caption = glue(
      "*COVID-19 testing scaled by state population size.
      Confirmed cases are assumed to be the tests returning a positive result.
      The current numbers available for some states are a lower limit, and as such, the proportion of the population tested is likely to be higher, as is the proportion of tests returning a negative result.*"
    ),
    emphasize.strong.rows = nrow(.)
  )
COVID-19 testing scaled by state population size. Confirmed cases are assumed to be the tests returning a positive result. The current numbers available for some states are a lower limit, and as such, the proportion of the population tested is likely to be higher, as is the proportion of tests returning a negative result.
State Population Confirmed Tested Tests / ’000 % Positive Tests % Negative Tests
South Australia 1,756,494 438 50,000 28.47 0.88% 99.12%
New South Wales 8,117,976 2,994 190,262 23.44 1.57% 98.43%
Queensland 5,115,451 1,026 97,057 18.97 1.06% 98.94%
Australian Capital Territory 428,060 106 7,738 18.08 1.37% 98.63%
Northern Territory 245,562 28 4,045 16.47 0.69% 99.31%
Tasmania 535,500 207 7,983 14.91 2.59% 97.41%
Victoria 6,629,870 1,346 98,000 14.78 1.37% 98.63%
Western Australia 2,630,557 549 34,199 13 1.61% 98.39%
Total 25,459,470 6,694 489,284 19.22 1.37% 98.63%

Current Growth Factor

list(
  confirmed %>%
    dplyr::filter(Country == "Australia") %>%
    arrange(date) %>%
    group_by(
      `Province/State`
    ) %>%
    mutate(
      new = c(0, diff(confirmed)),
      new_ma = sma(new, 7)
    ) %>%
    dplyr::filter(confirmed > 0, !is.na(new_ma)) %>%
    mutate(
      R = c(NA, new_ma[-1] / new_ma[-n()]),
      R = case_when(
        is.nan(R) ~ 0,
        !is.nan(R) ~ R
      )
    ) %>%
    ungroup() %>%
    arrange(`Province/State`),
  confirmed %>%
    dplyr::filter(Country == "Australia") %>%
    arrange(date) %>%
    group_by(Country, date) %>%
    summarise_at(vars(confirmed), sum) %>%
    ungroup() %>%
    mutate(
      new = c(0, diff(confirmed)),
      new_ma = sma(new, 7)
    ) %>%
    dplyr::filter(confirmed > 0, !is.na(new_ma)) %>%
    mutate(
      R = c(NA, new_ma[-1] / new_ma[-n()]),
      R = case_when(
        is.nan(R) ~ 0,
        !is.nan(R) ~ R
      ),
      `Province/State` = "All States"
    ) %>%
    arrange(`Province/State`)
) %>%
  bind_rows() %>%
  dplyr::filter(date > ymd("2020-03-15")) %>%
  ggplot(aes(date, R, colour = `Province/State`)) +
  geom_ribbon(aes(ymin = 1, ymax = R), alpha = 0.1) +
  geom_hline(yintercept = 1) +
  geom_smooth(se = FALSE, linetype = 2, size = 1/3) +
  geom_label(
    aes(label = R),
    data = . %>%
      dplyr::filter(date == max(date)) %>%
      mutate(R = round(R, 2), date = date + 1),
    fill = rgb(1, 1, 1, 0.3),
    show.legend = FALSE
  ) +
  labs(
    x = "Date", y = "Growth Factor"
  ) +
  facet_wrap(~`Province/State`, scales = "free_x") +
  theme(legend.position = "none")
*Growth factor for each State/Territory. This value becomes very volatile when daily new cases approach zero as is commonly observed in small populations and the end stages of an outbreak. In order to try and minimise volatility a 7 day moving average was used, in contrast to the 5-day average advocated [here](https://www.abc.net.au/news/2020-04-10/coronavirus-data-australia-growth-factor-covid-19/12132478)*

Growth factor for each State/Territory. This value becomes very volatile when daily new cases approach zero as is commonly observed in small populations and the end stages of an outbreak. In order to try and minimise volatility a 7 day moving average was used, in contrast to the 5-day average advocated here

R Session Information

R version 3.6.3 (2020-02-29)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: cowplot(v.1.0.0), plotly(v.4.9.2.1), DT(v.0.13), pander(v.0.6.3), RCurl(v.1.98-1.1), rvest(v.0.3.5), xml2(v.1.3.1), jsonlite(v.1.6.1), glue(v.1.4.0), broom(v.0.5.5), ggrepel(v.0.8.2), matrixStats(v.0.56.0), scales(v.1.1.0), lubridate(v.1.7.8), magrittr(v.1.5), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.0.8.5), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.2), tibble(v.3.0.0), ggplot2(v.3.3.0) and tidyverse(v.1.3.0)

loaded via a namespace (and not attached): nlme(v.3.1-147), bitops(v.1.0-6), fs(v.1.4.1), httr(v.1.4.1), rprojroot(v.1.3-2), tools(v.3.6.3), backports(v.1.1.6), R6(v.2.4.1), nortest(v.1.0-4), DBI(v.1.1.0), lazyeval(v.0.2.2), mgcv(v.1.8-31), colorspace(v.1.4-1), withr(v.2.1.2), gridExtra(v.2.3), tidyselect(v.1.0.0), curl(v.4.3), compiler(v.3.6.3), cli(v.2.0.2), Cairo(v.1.5-12), labeling(v.0.3), goftest(v.1.2-2), digest(v.0.6.25), foreign(v.0.8-76), rmarkdown(v.2.1), rio(v.0.5.16), pkgconfig(v.2.0.3), htmltools(v.0.4.0), dbplyr(v.1.4.2), highr(v.0.8), htmlwidgets(v.1.5.1), rlang(v.0.4.5), readxl(v.1.3.1), rstudioapi(v.0.11), farver(v.2.0.3), generics(v.0.0.2), crosstalk(v.1.1.0.1), zip(v.2.0.4), car(v.3.0-7), Matrix(v.1.2-18), Rcpp(v.1.0.4.6), munsell(v.0.5.0), fansi(v.0.4.1), abind(v.1.4-5), lifecycle(v.0.2.0), stringi(v.1.4.6), yaml(v.2.2.1), carData(v.3.0-3), MASS(v.7.3-51.5), grid(v.3.6.3), crayon(v.1.3.4), lattice(v.0.20-41), haven(v.2.2.0), splines(v.3.6.3), hms(v.0.5.3), knitr(v.1.28), pillar(v.1.4.3), reprex(v.0.3.0), evaluate(v.0.14), data.table(v.1.12.8), modelr(v.0.1.6), olsrr(v.0.5.3), vctrs(v.0.2.4), selectr(v.0.4-2), cellranger(v.1.1.0), gtable(v.0.3.0), assertthat(v.0.2.1), openxlsx(v.4.1.4), xfun(v.0.13), viridisLite(v.0.3.0), ellipsis(v.0.3.0) and here(v.0.1)